library(tidyverse)
library(ggplot2)
library(broom)
library(readxl)
library(tidytext)
library(dplyr)
library(tm)
library(SnowballC)
library(lubridate)
library(plotly)
library(ggpmisc)
# Load the three article spreadsheets. Columns 2-5 have no header in these
# files, so readxl auto-names them `...2` to `...5` (see the "New names" notes
# recorded below); the cleaning code further down selects them by those names.
Palestine_news_articles <- read_xlsx("../data/Palestine_news_articles.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Israel_news_articles <- read_xlsx("../data/israel_news_articles.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Gaza_news_articles <- read_xlsx("../data/gaza_news_articles.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
# Headline spreadsheet; the columns `Palestine_headings` and `Israel_headings`
# are selected from it later in this file.
chat_gpt_article_headlines <- read_xlsx("../data/chat_gpt_data (1).xlsx")
# Extra headlines with a description and a date; all three columns are read as
# character (see the column specification recorded below).
additional_news_data <- read_csv("../data/news_data.csv")
## Rows: 3338 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): headline, description, date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Large headline file; `publish_date` is read as a number and is converted
# with ymd() later in this file.
million_news <- read_csv("../data/abcnews-date-text 2 (1).csv")
## Rows: 1244184 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): headline_text
## dbl (1): publish_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean one raw article spreadsheet into a tidy tibble.
#
# raw_articles: tibble as returned by read_xlsx(), with the article fields in
#   the auto-named columns `...2` to `...5`.
# label: value written to the `keyword` column of every row.
#
# Returns a tibble with the columns title, journal, date, description and
# keyword, without duplicate rows.
tidy_articles <- function(raw_articles, label) {
  raw_articles %>%
    select(
      title = "...2",
      journal = "...3",
      date = "...4",
      description = "...5"
    ) %>%
    # Rows without a title carry no article.
    filter(!is.na(title)) %>%
    # Drop header rows that are repeated inside the sheet.
    # NOTE(review): `!=` yields NA for missing cells, so rows with a missing
    # journal, date or description are dropped here as well - confirm that
    # this is intended.
    filter(
      title != "Title",
      journal != "Journal",
      date != "Date",
      description != "Description"
    ) %>%
    # Drop rows whose journal cell starts with "#".
    filter(!str_starts(journal, "#")) %>%
    mutate(keyword = label) %>%
    distinct()
}

tidy_pna <- tidy_articles(Palestine_news_articles, "palestine")
tidy_ina <- tidy_articles(Israel_news_articles, "israel")
tidy_gna <- tidy_articles(Gaza_news_articles, "gaza")
# Stack the per-keyword tables. The join keys are spelled out so that a later
# change to the columns cannot silently change what the join matches on; they
# are the five columns produced by the cleaning step above.
article_keys <- c("title", "journal", "date", "description", "keyword")

# Palestine + Israel articles.
pidf <- full_join(tidy_pna, tidy_ina, by = article_keys)
# Palestine + Israel + Gaza articles.
all_data <- full_join(pidf, tidy_gna, by = article_keys)
# Horizontal bar chart of the journals with the most articles.
#
# articles: tibble with a `journal` column (one row per article).
#
# Returns a ggplot object; calling the function at top level prints the plot.
plot_top_journals <- function(articles) {
  articles %>%
    count(journal) %>%
    # slice_max() replaces the superseded top_n(); like top_n() it keeps ties,
    # so more than ten journals can be shown.
    slice_max(n, n = 10) %>%
    ggplot(aes(x = reorder(journal, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(x = "Journal", y = "Frequency") +
    theme_minimal()
}

plot_top_journals(tidy_pna)

plot_top_journals(tidy_ina)

# Count how often each (title, journal, description) combination occurs in the
# combined Palestine/Israel table. `.groups = "drop_last"` states the grouping
# of the result explicitly (grouped by title and journal, as before) instead of
# relying on the default that summarise() reports in a message.
pidf %>%
  group_by(title, journal, description) %>%
  summarise(count_of_string = n(), .groups = "drop_last")
## # A tibble: 1,004 × 4
## # Groups:   title, journal [983]
##    title                                     journal description count_of_string
##    <chr>                                     <chr>   <chr>                 <int>
##  1 10 books to help you understand Israel a… UNSW S… "10 books …               1
##  2 100-200,000, Not Two Million': Israel's … Haaretz "100-200,0…               1
##  3 2913 Palestinian children killed in Gaza… Defens… "34 Palest…               1
##  4 3,268 Israelis evacuated to hospitals si… Anadol… "The Yedio…               1
##  5 326 Palestinian children killed as Israe… Defens… "DCIP has …               1
##  6 34 rockets fired from Lebanon at Israel … The Ti… "The barra…               1
##  7 40 Books to Understand Palestine ‹ Liter… Litera… "From Ghas…               1
##  8 A Left-vs.-Left House Battle, Funded by … The Ne… "Wesley Be…               1
##  9 A Palestinian and an Israeli physician s… The La… "As doctor…               1
## 10 A Prayer for the Israel Palestine Confli… The As… "A Prayer …               1
## # ℹ 994 more rows
  # mutate(count_of_string = n) %>%
  #mutate(keyword = if_else(count_of_string == 1, keyword, "Both"))
# Articles found under both search terms, matched on title, journal and
# description.
inner_join_df <- tidy_pna %>%
  inner_join(
    tidy_ina,
    by = c("title", "journal", "description"),
    # Each input is only distinct over all five columns, so the same article
    # can occur with several dates on either side; the fan-out is expected.
    relationship = "many-to-many"
  ) %>%
  mutate(keyword = "Both") %>%
  select(title, journal, description, keyword)

# Flag every row of `pidf` whose article also appears in `inner_join_df`.
# The lookup is de-duplicated first (its `keyword` is constant), so each key
# matches at most one lookup row and the join cannot multiply rows.
tokenized_df <- pidf %>%
  left_join(
    distinct(inner_join_df),
    by = c("title", "journal", "description"),
    suffix = c("", "_both")
  ) %>%
  # Named rename instead of positional `colnames(...)[6]`.
  rename(in_both = keyword_both) %>%
  distinct() %>%
  # Unmatched rows have NA here. Comparing against "Both" kept that NA, so
  # later `in_both == F` filters silently dropped every such row; an explicit
  # NA test gives a proper TRUE/FALSE flag.
  mutate(in_both = !is.na(in_both))
# Lexicons used throughout the analysis. Note that these two names shadow the
# `sentiments` and `stop_words` datasets shipped with tidytext.
sentiments <- get_sentiments("bing")
stop_words <- get_stopwords()

# Keep the one-row-per-article table. Titles and descriptions must both be
# tokenised from it: tokenising descriptions from the already title-tokenised
# table repeats every description once per title word and inflates all
# description word counts.
article_df <- tokenized_df

# One row per non-stop-word title token; `Title` keeps the full headline.
tokenized_df <- article_df %>%
  mutate(Title = title) %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word") %>%
  rename(Description = description) %>%
  mutate(title_or_description = "title")

# One row per non-stop-word description token; `Description` keeps the full
# text. `relocate()` keeps the column order this table had before.
tokenized_df2 <- article_df %>%
  mutate(Title = title, Description = description) %>%
  select(-title) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  relocate(word, .before = Description) %>%
  mutate(title_or_description = "description")

# Combine both token tables. `title_or_description` differs between the two
# inputs, so no rows ever match and the result is simply both tables stacked;
# `date` is not a key, so it is split into `date.x` (title rows) and `date.y`
# (description rows).
All_tokens <- tokenized_df %>%
  full_join(
    tokenized_df2,
    by = c(
      "journal", "keyword", "in_both", "Title", "word", "Description",
      "title_or_description"
    )
  )
# Bar chart of the most frequent title tokens (top 20 by count, ties kept).
ggplot(
  data = slice_max(count(tokenized_df, word, sort = TRUE), n, n = 20),
  mapping = aes(x = n, y = fct_reorder(word, n))
) +
  geom_col()

# data$word <- wordStem(data$word)
# Positive (bing lexicon) title words for articles found under one keyword
# only.
#
# tokens: token table with the columns `in_both`, `keyword` and `word`.
# target_keyword: keyword whose articles are inspected.
#
# Returns the counts per word, grouped by `sentiment` and sorted by frequency.
positive_title_words <- function(tokens, target_keyword) {
  tokens %>%
    # `in_both` can be NA for articles that were never matched. `in_both == F`
    # drops those rows, which is why this query used to return zero rows;
    # a missing flag is treated as "not in both".
    filter(!coalesce(in_both, FALSE), keyword == target_keyword) %>%
    inner_join(sentiments, by = "word") %>%
    filter(sentiment == "positive") %>%
    group_by(sentiment) %>%
    count(word, sort = TRUE)
}

positive_title_words(tokenized_df, "palestine")

positive_title_words(tokenized_df, "israel")
# Per-journal share of each title word, restricted to "hamas" and to journals
# that use it at least 100 times.
pidf %>%
  unnest_tokens(output = word, input = title) %>%
  anti_join(stop_words) %>%
  count(journal, word, sort = TRUE) %>%
  group_by(journal) %>%
  mutate(
    total_words = sum(n),
    word_percentage = n / total_words,
    # Share scaled by the log of the journal's total token count.
    weighted_percentage = word_percentage * log(total_words)
  ) %>%
  arrange(desc(word_percentage)) %>%
  filter(word == "hamas", n >= 100)
## Joining with `by = join_by(word)`
## # A tibble: 0 × 6
## # Groups:   journal [0]
## # ℹ 6 variables: journal <chr>, word <chr>, n <int>, total_words <int>,
## #   word_percentage <dbl>, weighted_percentage <dbl>
  #unnest_tokens(word, description)
# All_tokens %>% 
#   group_by(word) %>%  
#   summarize(word_count = n(), journal, date, Description, keyword, in_both, Title, word, title_or_description) %>% 
#   mutate(total_words = sum(n)) %>% 
#   mutate(word_percentage = n/total_words) %>% 
#   mutate(weighted_percentage = word_percentage * log(total_words)) %>%
#   arrange(desc(word_percentage)) %>% 
#   group_by(word)
# How often "palestine" and "israel" occur in titles, per article keyword.
tokenized_df %>%
  filter(word %in% c("palestine", "israel")) %>%
  count(keyword, word, sort = TRUE) %>%
  ggplot(mapping = aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Words Frequency in Title")

# The same comparison for description tokens.
tokenized_df2 %>%
  filter(word %in% c("palestine", "israel")) %>%
  count(keyword, word, sort = TRUE) %>%
  ggplot(mapping = aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Words Frequency in Description")

# "israel" / "palestine" counts per article keyword, with one facet for title
# tokens and one for description tokens.
All_tokens %>%
  filter(word %in% c("palestine", "israel")) %>%
  count(keyword, title_or_description, word, sort = TRUE) %>%
  ggplot(mapping = aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  facet_wrap(~title_or_description, nrow = 2) +
  labs(title = 'Frequency of Words "Israel" and "Palestine"') +
  xlab("News Article Subject") +
  ylab("Frequency")

# "gaza" / "west" / "bank" counts per article keyword, with one facet for
# title tokens and one for description tokens. The plot title was copied from
# the Israel/Palestine plot and named the wrong words; it now matches the
# filter.
All_tokens %>%
  group_by(keyword, title_or_description) %>%
  count(word, sort = TRUE) %>%
  filter(word %in% c("gaza", "west", "bank")) %>%
  ggplot(aes(x = keyword, y = n, fill = word)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = 'Frequency of Words "Gaza", "West" and "Bank"') +
  xlab("News Article Subject") +
  ylab("Frequency") +
  facet_wrap(~title_or_description, nrow = 2)

# Working copy of the headline spreadsheet, plus one single-column table per
# headline set.
gpt <- chat_gpt_article_headlines
pgpt <- select(chat_gpt_article_headlines, Palestine_headings)
igpt <- select(chat_gpt_article_headlines, Israel_headings)
# Sentiment-bearing words (bing lexicon) in the Palestine headline column,
# counted per sentiment and sorted by frequency.
pgpt %>%
  unnest_tokens(output = word, input = Palestine_headings) %>%
  anti_join(stop_words) %>%
  inner_join(sentiments) %>%
  group_by(sentiment) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## # A tibble: 246 × 3
## # Groups:   sentiment [2]
##    sentiment word            n
##    <chr>     <chr>       <int>
##  1 negative  conflict      158
##  2 positive  peace          90
##  3 positive  innovation     73
##  4 negative  crisis         67
##  5 positive  sustainable    47
##  6 positive  unity          37
##  7 positive  wins           31
##  8 negative  struggle       28
##  9 positive  renewed        28
## 10 positive  advocate       24
## # ℹ 236 more rows
# Sentiment-bearing words (bing lexicon) in the Israel headline column,
# counted per sentiment and sorted by frequency.
igpt %>%
  unnest_tokens(output = word, input = Israel_headings) %>%
  anti_join(stop_words) %>%
  inner_join(sentiments) %>%
  group_by(sentiment) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## # A tibble: 167 × 3
## # Groups:   sentiment [2]
##    sentiment word             n
##    <chr>     <chr>        <int>
##  1 negative  condemns       135
##  2 positive  breakthrough    96
##  3 positive  innovation      65
##  4 positive  peace           61
##  5 positive  innovative      52
##  6 positive  diplomatic      42
##  7 positive  sustainable     37
##  8 negative  thwart          34
##  9 positive  support         33
## 10 positive  boost           30
## # ℹ 157 more rows
# Unique headlines per headline set.
distinct_igpt <- distinct(igpt)
distinct_pgpt <- distinct(pgpt)
# Case-insensitive literal search terms that mark a headline as "palestine".
palestine_terms <- c("palestine", "palestinian", "gaza", "west bank")

# TRUE where the headline contains at least one of the terms.
mentions_palestine <- Reduce(
  `|`,
  lapply(
    palestine_terms,
    function(term) {
      str_detect(million_news$headline_text, fixed(term, ignore_case = TRUE))
    }
  )
)

palestine_million <- million_news %>%
  filter(mentions_palestine) %>%
  mutate(keyword = "palestine")

# Headlines that mention Israel.
israel_million <- million_news %>%
  filter(str_detect(headline_text, fixed("israel", ignore_case = TRUE))) %>%
  mutate(keyword = "israel")

# A headline matching both term sets appears once under each keyword.
million_df <- palestine_million %>%
  full_join(
    israel_million,
    by = c("publish_date", "headline_text", "keyword")
  )
# Give both extra sources the same `title` column name and a `source` tag.
additional_news_data <- additional_news_data %>%
  rename(title = headline) %>%
  mutate(source = "additional_news_data")

# `publish_date` is parsed with ymd() and rewritten as a "dd-mm-YYYY" string.
million_df <- million_df %>%
  rename(title = headline_text, date = publish_date) %>%
  mutate(
    date = format(ymd(date), "%d-%m-%Y"),
    source = "million_news"
  )

# Combine both extra sources into one table.
joining <- additional_news_data %>%
  full_join(million_df, by = c("title", "date", "source"))
# Tag the Palestine/Israel article table with its source.
pidf <- mutate(pidf, source = "my_scaped_data")

# Combine it with the extra sources. `date` is not a join key, so it is split
# in two: the date from `pidf` becomes `untidy_date` and the date from
# `joining` becomes `date`.
all_real_data <- full_join(
  pidf,
  joining,
  by = c("title", "description", "keyword", "source")
) %>%
  rename(untidy_date = date.x, date = date.y)


# One row per non-stop-word title token; `Title` keeps the full headline.
# Columns are renamed by name rather than by position, so a change in column
# order cannot rename the wrong column.
tokenized_df3 <- all_real_data %>%
  mutate(Title = title) %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word") %>%
  rename(Description = description) %>%
  mutate(title_or_description = "title")

# One row per non-stop-word description token; `Description` keeps the full
# text.
tokenized_df4 <- all_real_data %>%
  mutate(Description = description) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  rename(Title = title) %>%
  mutate(title_or_description = "description")

# Stack both token tables. The previous full join used every column as a key,
# including `title_or_description`, which differs between the two inputs, so
# it could never match a row and was just an expensive way to stack them.
all_real_data_tokens <- bind_rows(tokenized_df3, tokenized_df4)
# Inspect the tokens of articles tagged "israel". The equality test already
# excludes rows with a missing keyword.
all_real_data_tokens %>%
  filter(!is.na(word), keyword == "israel") %>%
  group_by(keyword)
## # A tibble: 33,833 × 9
## # Groups:   keyword [1]
##    journal  untidy_date  Description            keyword source date  Title word 
##    <chr>    <chr>        <chr>                  <chr>   <chr>  <chr> <chr> <chr>
##  1 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … fbi  
##  2 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … dire…
##  3 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … wray 
##  4 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … makes
##  5 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … surp…
##  6 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … isra…
##  7 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … stop 
##  8 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … amid 
##  9 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … elev…
## 10 Fox News 22 hours ago FBI Director Christop… israel  my_sc… <NA>  FBI … thre…
## # ℹ 33,823 more rows
## # ℹ 1 more variable: title_or_description <chr>
# Inspect the tokens of articles tagged "palestine". The equality test already
# excludes rows with a missing keyword.
all_real_data_tokens %>%
  filter(!is.na(word), keyword == "palestine") %>%
  group_by(keyword)
## # A tibble: 27,798 × 9
## # Groups:   keyword [1]
##    journal      untidy_date  Description        keyword source date  Title word 
##    <chr>        <chr>        <chr>              <chr>   <chr>  <chr> <chr> <chr>
##  1 The Nation   12 hours ago Hundreds of prote… palest… my_sc… <NA>  The … move…
##  2 The Nation   12 hours ago Hundreds of prote… palest… my_sc… <NA>  The … pale…
##  3 The Nation   12 hours ago Hundreds of prote… palest… my_sc… <NA>  The … takes
##  4 The Nation   12 hours ago Hundreds of prote… palest… my_sc… <NA>  The … moma 
##  5 The Guardian 1 day ago    Photos displayed … palest… my_sc… <NA>  Thre… three
##  6 The Guardian 1 day ago    Photos displayed … palest… my_sc… <NA>  Thre… guil…
##  7 The Guardian 1 day ago    Photos displayed … palest… my_sc… <NA>  Thre… terr…
##  8 The Guardian 1 day ago    Photos displayed … palest… my_sc… <NA>  Thre… offe…
##  9 The Guardian 1 day ago    Photos displayed … palest… my_sc… <NA>  Thre… para…
## 10 The Guardian 1 day ago    Photos displayed … palest… my_sc… <NA>  Thre… imag…
## # ℹ 27,788 more rows
## # ℹ 1 more variable: title_or_description <chr>
# The 20 most frequent tokens per keyword (ties kept), in one bar chart.
all_real_data_tokens %>%
  filter(!is.na(word), !is.na(keyword)) %>%
  count(keyword, word, sort = TRUE) %>%
  group_by(keyword) %>%
  slice_max(n, n = 20) %>%
  ggplot(mapping = aes(x = n, y = fct_reorder(word, n), fill = keyword)) +
  geom_col()

# Characters 7-10 of the "dd-mm-YYYY" date string are the year. In
# `all_real_data` the year stays a character column.
all_real_data$year <- str_sub(all_real_data$date, 7, 10)

# In the token table the year is converted to a number.
all_real_data_tokens <- all_real_data_tokens %>%
  mutate(year = as.numeric(str_sub(date, 7, 10)))

# Number of tokens per year. Rows without a year are counted too and show up
# as the missing value that ggplot reports removing.
ggplot(
  data = count(all_real_data_tokens, year),
  mapping = aes(x = year, y = n)
) +
  geom_col() +
  scale_x_continuous(breaks = seq(2003, 2023, by = 1))
## Warning: Removed 1 rows containing missing values (`position_stack()`).

# Share of the token "palestine" among all tokens of each year. The red lines
# mark 2009 and 2017. The dangling `labs(title = )` call, which had no value
# and did nothing, was removed.
all_real_data_tokens %>%
  group_by(year) %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(word, sort = TRUE) %>%
  # Totals are computed per year because the data is still grouped by year.
  mutate(total_words = sum(n)) %>%
  mutate(word_percentage = n / total_words) %>%
  filter(word == "palestine") %>%
  group_by(word) %>%
  ggplot(aes(x = year, y = word_percentage)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2003, 2024, by = 2)) +
  geom_vline(xintercept = 2009, colour = "red") +
  geom_vline(xintercept = 2017, colour = "red")

# Share of the token "palestine" per year, with one line per article keyword.
# The accidentally duplicated `word_percentage` step was removed.
all_real_data_tokens %>%
  filter(!is.na(keyword)) %>%
  group_by(year, keyword) %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(word, sort = TRUE) %>%
  # Totals are per (year, keyword) because the data is still grouped.
  mutate(total_words = sum(n)) %>%
  mutate(word_percentage = n / total_words) %>%
  filter(word %in% c("palestine")) %>%
  group_by(word) %>%
  ggplot(aes(x = year, y = word_percentage, colour = keyword)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2003, 2024, by = 1))

# Yearly share of the token "palestine" among all tokens, with red reference
# lines at 2009 and 2017.
all_real_data_tokens %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(year, word, sort = TRUE) %>%
  group_by(year) %>%
  mutate(
    total_words = sum(n),
    word_percentage = n / total_words
  ) %>%
  ungroup() %>%
  filter(word == "palestine") %>%
  ggplot(mapping = aes(x = year, y = word_percentage)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2003, 2024, by = 1)) +
  geom_vline(xintercept = 2009, colour = "red") +
  geom_vline(xintercept = 2017, colour = "red") +
  labs(title = 'Percentage of the Word "Palestine" in Palestine/Israel News Article Headings Over Time') +
  xlab("Year") +
  ylab("Word Percentage")

It appears that between 2009 and 2011 there is a large increase in the use of the word "Palestine", particularly in news articles about Palestine. Why?

2009-2010: Settlement Freeze. U.S. President Barack Obama attempted to revive Israeli-Palestinian peace talks shortly after taking office in 2009. In a speech at Cairo University that year, Obama reiterated his support for a two-state solution. Why It Matters: As part of a good faith gesture, Israeli Prime Minister Benjamin Netanyahu implemented a settlement freeze, a key Palestinian demand, that lasted 10 months. While talks briefly restarted, Palestinian Authority President Mahmoud Abbas aborted the talks.

No American president ever came into office with a better understanding of the tragic history of the Palestinians or a deeper commitment to help them achieve independence than Obama. In his Cairo speech in April 2009, Obama solemnly pledged to do everything in his power to bring about Palestinian statehood. - Al Jazeera

Trump took office in 2017 (he was elected in November 2016).

The 2013–2014 Israeli–Palestinian peace talks were part of the Israeli–Palestinian peace process. Direct negotiations between Israel and the Palestinians began on 29 July 2013 following an attempt by United States Secretary of State John Kerry to restart the peace process.

# Yearly share of the token "terrorism" among all tokens, with red reference
# lines at 2009 and 2017 and a smoothed trend.
all_real_data_tokens %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(year, word, sort = TRUE) %>%
  group_by(year) %>%
  mutate(
    total_words = sum(n),
    word_percentage = n / total_words
  ) %>%
  ungroup() %>%
  filter(word == "terrorism") %>%
  ggplot(mapping = aes(x = year, y = word_percentage)) +
  geom_line() +
  scale_x_continuous(breaks = seq(2003, 2023, by = 1)) +
  geom_vline(xintercept = 2009, colour = "red") +
  geom_vline(xintercept = 2017, colour = "red") +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Share of the token "terrorism" within each (year, keyword) group, drawn as
# points coloured by keyword. Rows with a missing keyword form their own
# group.
all_real_data_tokens %>%
  filter(!is.na(year), !is.na(word)) %>%
  count(year, keyword, word, sort = TRUE) %>%
  group_by(year, keyword) %>%
  mutate(
    total_words = sum(n),
    word_percentage = n / total_words
  ) %>%
  ungroup() %>%
  filter(word == "terrorism") %>%
  ggplot(mapping = aes(x = year, y = word_percentage, colour = keyword)) +
  geom_point() +
  scale_x_continuous(breaks = seq(2003, 2023, by = 1))

# Inspect the rows that hold title tokens.
filter(All_tokens, title_or_description == "title")
## # A tibble: 9,293 × 9
##    journal   date.x Description keyword in_both Title word  title_or_description
##    <chr>     <chr>  <chr>       <chr>   <lgl>   <chr> <chr> <chr>               
##  1 The Nati… 12 ho… Hundreds o… palest… NA      The … move… title               
##  2 The Nati… 12 ho… Hundreds o… palest… NA      The … pale… title               
##  3 The Nati… 12 ho… Hundreds o… palest… NA      The … takes title               
##  4 The Nati… 12 ho… Hundreds o… palest… NA      The … moma  title               
##  5 The Guar… 1 day… Photos dis… palest… NA      Thre… three title               
##  6 The Guar… 1 day… Photos dis… palest… NA      Thre… guil… title               
##  7 The Guar… 1 day… Photos dis… palest… NA      Thre… terr… title               
##  8 The Guar… 1 day… Photos dis… palest… NA      Thre… offe… title               
##  9 The Guar… 1 day… Photos dis… palest… NA      Thre… para… title               
## 10 The Guar… 1 day… Photos dis… palest… NA      Thre… imag… title               
## # ℹ 9,283 more rows
## # ℹ 1 more variable: date.y <chr>
# Inspect the rows that hold description tokens.
filter(All_tokens, title_or_description == "description")
## # A tibble: 127,318 × 9
##    journal   date.x Description keyword in_both Title word  title_or_description
##    <chr>     <chr>  <chr>       <chr>   <lgl>   <chr> <chr> <chr>               
##  1 The Nati… <NA>   Hundreds o… palest… NA      The … hund… description         
##  2 The Nati… <NA>   Hundreds o… palest… NA      The … prot… description         
##  3 The Nati… <NA>   Hundreds o… palest… NA      The … occu… description         
##  4 The Nati… <NA>   Hundreds o… palest… NA      The … stor… description         
##  5 The Nati… <NA>   Hundreds o… palest… NA      The … muse… description         
##  6 The Nati… <NA>   Hundreds o… palest… NA      The … remi… description         
##  7 The Nati… <NA>   Hundreds o… palest… NA      The … thea… description         
##  8 The Nati… <NA>   Hundreds o… palest… NA      The … world description         
##  9 The Nati… <NA>   Hundreds o… palest… NA      The … evade description         
## 10 The Nati… <NA>   Hundreds o… palest… NA      The … poli… description         
## # ℹ 127,308 more rows
## # ℹ 1 more variable: date.y <chr>
# Scatter of word counts per keyword and text field.
# NOTE(review): both axes are mapped to `n`, so every point lies on the
# diagonal - confirm whether a title-vs-description comparison was intended.
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  count(keyword, title_or_description, word, sort = TRUE) %>%
  ggplot(mapping = aes(x = n, y = n, colour = keyword)) +
  geom_point()

# One point per (keyword, word): count in titles against count in
# descriptions. A word missing from one of the two fields gets a count of 0.
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  count(keyword, title_or_description, word, sort = TRUE) %>%
  pivot_wider(
    names_from = title_or_description,
    values_from = n,
    values_fill = 0
  ) %>%
  ggplot(mapping = aes(x = title, y = description)) +
  geom_point()

# Strip soft hyphens (U+00AD) from the tokens; they are invisible but make
# otherwise identical words compare as different. The character is written as
# an escape so that it is visible in the source.
# NOTE(review): in this section the cleanup runs after the earlier All_tokens
# plots, so those still counted the unclean tokens - consider moving it to
# right after tokenisation.
All_tokens$word <- str_replace_all(All_tokens$word, "\u00ad", "")
# Compare how large a share of all title tokens and of all description tokens
# each word takes, labelling the words whose share exceeds 0.45% in both.
# The axes show shares, not counts, so the labels were corrected (and the
# typo in the plot title fixed).
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  count(title_or_description, word, sort = TRUE) %>%
  pivot_wider(
    names_from = title_or_description,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(
    total_description = sum(description),
    total_title = sum(title),
    title_percentage = title / total_title,
    description_percentage = description / total_description
  ) %>%
  filter(title_percentage > 0.0045, description_percentage > 0.0045) %>%
  ggplot(aes(x = title_percentage, y = description_percentage, label = word)) +
  geom_point() +
  # Nudge the labels up and to the right of their points.
  geom_text(hjust = -0.1, vjust = -0.5, size = 3) +
  labs(
    x = "Share of Title Words",
    y = "Share of Description Words",
    title = "Word Share Comparison: Title vs. Description"
  ) +
  theme_minimal()

# Title share against description share for every word that occurs in both
# fields, with a linear fit, its R^2, and the identity line (red) for
# reference. The global `label = word` aesthetic was removed: no layer here
# draws text, and it made the smoothing layer warn about a dropped aesthetic.
# The axes show shares, not counts, so the labels were corrected (and the
# typo in the plot title fixed).
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  count(title_or_description, word, sort = TRUE) %>%
  pivot_wider(
    names_from = title_or_description,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(
    total_description = sum(description),
    total_title = sum(title),
    title_percentage = title / total_title,
    description_percentage = description / total_description
  ) %>%
  filter(title_percentage > 0, description_percentage > 0) %>%
  ggplot(aes(x = title_percentage, y = description_percentage)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Share of Title Words",
    y = "Share of Description Words",
    title = "Word Share Comparison: Title vs. Description"
  ) +
  theme_minimal() +
  geom_smooth(method = "lm") +
  stat_poly_eq(
    formula = y ~ x,
    aes(label = paste(after_stat(rr.label)))
  ) +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  scale_x_continuous(breaks = seq(0, 0.06, by = 0.01)) +
  scale_y_continuous(breaks = seq(0, 0.06, by = 0.01)) +
  # Zoom without dropping the points outside the window from the fit.
  coord_cartesian(xlim = c(0, 0.06), ylim = c(0, 0.06))
## `geom_smooth()` using formula = 'y ~ x'

# Interactive version of the share comparison: hovering over a point shows the
# word.
All_tokens %>%
  filter(!is.na(word), !is.na(keyword), !is.na(title_or_description)) %>%
  count(title_or_description, word, sort = TRUE) %>%
  pivot_wider(
    names_from = title_or_description,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(
    total_description = sum(description),
    total_title = sum(title),
    title_percentage = title / total_title,
    description_percentage = description / total_description
  ) %>%
  filter(title_percentage > 0, description_percentage > 0) %>%
  plot_ly(x = ~title_percentage, y = ~description_percentage, text = ~word) %>%
  add_markers()
 # Recover a year from free-form date text.
#
# text: character vector such as "2023-10-24", "Dec 7, 2023" or
#   "22 hours ago".
# years: candidate years searched for as literal substrings; if several occur
#   in one string, the highest wins.
# fallback: year assigned to non-missing text that contains no candidate
#   (e.g. relative dates such as "22 hours ago").
#   NOTE(review): 2024 is presumably the year the articles were collected -
#   confirm.
#
# Returns a numeric vector as long as `text`; missing text stays NA.
year_from_text <- function(text, years = 2000:2023, fallback = 2024) {
  detected <- rep(as.numeric(fallback), length(text))
  detected[is.na(text)] <- NA_real_
  # Ascending order, so that a later (higher) year overwrites an earlier hit.
  for (candidate in sort(years)) {
    hit <- str_detect(text, fixed(as.character(candidate)))
    # which() ignores the NA that str_detect() returns for missing text.
    detected[which(hit)] <- candidate
  }
  detected
}

# Fill in the year for rows that have none (the rows whose only date is the
# free-form `untidy_date`).
all_real_data_tokens <- all_real_data_tokens %>%
  mutate(
    fixed_date = year_from_text(untidy_date),
    year = if_else(is.na(year), fixed_date, year)
  )
# Characters 7-10 of the date string are taken as the year (kept as text).
additional_news_data$year <- str_sub(additional_news_data$date, 7, 10)
# Inspect the tokens that have no keyword.
filter(all_real_data_tokens, is.na(keyword))
## # A tibble: 64,662 × 11
##    journal untidy_date Description            keyword source   date  Title word 
##    <chr>   <chr>       <chr>                  <chr>   <chr>    <chr> <chr> <chr>
##  1 <NA>    <NA>        Gaza’s jour­nal­ists ar… <NA>    additio… 23-1… Gaza… gaza…
##  2 <NA>    <NA>        Gaza’s jour­nal­ists ar… <NA>    additio… 23-1… Gaza… jour…
##  3 <NA>    <NA>        Gaza’s jour­nal­ists ar… <NA>    additio… 23-1… Gaza… tar­… 
##  4 <NA>    <NA>        Gaza’s jour­nal­ists ar… <NA>    additio… 23-1… Gaza… ca­s… 
##  5 <NA>    <NA>        Gaza’s jour­nal­ists ar… <NA>    additio… 23-1… Gaza… is­r… 
##  6 <NA>    <NA>        Gaza’s jour­nal­ists ar… <NA>    additio… 23-1… Gaza… war  
##  7 <NA>    <NA>        Hun­dreds of Pales­tini… <NA>    additio… 23-1… Is­r…  is­r… 
##  8 <NA>    <NA>        Hun­dreds of Pales­tini… <NA>    additio… 23-1… Is­r…  or­d… 
##  9 <NA>    <NA>        Hun­dreds of Pales­tini… <NA>    additio… 23-1… Is­r…  death
## 10 <NA>    <NA>        Hun­dreds of Pales­tini… <NA>    additio… 23-1… Is­r…  cor­… 
## # ℹ 64,652 more rows
## # ℹ 3 more variables: title_or_description <chr>, year <dbl>, fixed_date <dbl>
# Inspect the 2023 rows whose token is exactly "israel" once surrounding and
# repeated whitespace has been squished.
all_real_data_tokens %>%
  mutate(word = str_squish(word)) %>%
  filter(year == 2023, word == "israel")
## # A tibble: 465 × 11
##    journal              untidy_date Description keyword source date  Title word 
##    <chr>                <chr>       <chr>       <chr>   <chr>  <chr> <chr> <chr>
##  1 EEAS                 23.10.2023  "The EU's … palest… my_sc… <NA>  Isra… isra…
##  2 The Economist        Dec 7, 2023 "IF YOU WA… palest… my_sc… <NA>  Isra… isra…
##  3 Carnegie Endowment … Nov 30, 20… "Navigatin… palest… my_sc… <NA>  Navi… isra…
##  4 UN News              2023-10-24  "On day 17… palest… my_sc… <NA>  Isra… isra…
##  5 UN News              2023-10-29  "The crisi… palest… my_sc… <NA>  Isra… isra…
##  6 Anadolu Ajansı       2023-11-24  "BBC repor… palest… my_sc… <NA>  BBC … isra…
##  7 The Conversation     17 Oct 2023 "The joint… palest… my_sc… <NA>  How … isra…
##  8 Human Rights Watch   26 Oct 2023 "Responses… palest… my_sc… <NA>  Isra… isra…
##  9 Al Jazeera           27 Nov 2023 "In 1947, … palest… my_sc… <NA>  Isra… isra…
## 10 GOV.UK               27 Sept 20… "The UK ca… palest… my_sc… <NA>  The … isra…
## # ℹ 455 more rows
## # ℹ 3 more variables: title_or_description <chr>, year <dbl>, fixed_date <dbl>
  # all_real_data_tokens$word <- gsub("[[:space:]]", "", all_real_data_tokens$word)
  # 
  # Strip soft hyphens (U+00AD) from the tokens; they are invisible but make
  # otherwise identical words compare as different. The character is written
  # as an escape so that it is visible in the source.
  # NOTE(review): in this section the cleanup runs after the per-year word
  # shares were computed, so those still counted the unclean tokens - consider
  # moving it to right after tokenisation.
  all_real_data_tokens$word <- str_replace_all(all_real_data_tokens$word, "\u00ad", "")